Code
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)
# Train 80% -> real train 75%, weight train 5% (weight from t-1)
# Test 20%
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)

# Load the order-book snapshots for one stock.
stock <- read.csv("data/individual_book_train/stock_100.csv")

# Derive the level-1 features used throughout the analysis:
#   WAP          - weighted average price of the best bid/ask, each price
#                  weighted by the size on the opposite side
#   BidAskSpread - relative spread between best ask and best bid
#   imbalance    - absolute level-1 size imbalance, in [0, 1]
stock <- stock %>%
  mutate(
    WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) /
      (bid_size1 + ask_size1),
    BidAskSpread = ask_price1 / bid_price1 - 1,
    imbalance = abs((bid_size1 - ask_size1) / (bid_size1 + ask_size1))
  )
# Build, for each of the first 20 time_ids, a per-second log-return series
# covering seconds 1..600, plus the mean imbalance and mean bid-ask spread.
#time_IDs <- unique(stock$time_id)
# head() avoids NA padding when the file holds fewer than 20 time_ids.
time_IDs <- head(unique(stock$time_id), 20)
log_rs <- vector("list", length(time_IDs))
imba_mean <- numeric(length(time_IDs))
BAS_mean <- numeric(length(time_IDs))
for (i in seq_along(time_IDs)) {
  # Filter once per time_id instead of once per extracted column.
  snapshot <- stock %>% filter(time_id == time_IDs[i])
  sec <- snapshot$seconds_in_bucket
  price <- snapshot$WAP
  imba_mean[[i]] <- mean(snapshot$imbalance)
  BAS_mean[[i]] <- mean(snapshot$BidAskSpread)

  # Log return between consecutive snapshots, stamped with the later time.
  log_r <- log(price[-1] / price[-length(price)])
  returns <- data.frame(time = sec[-1], log_return = log_r)

  # Seconds without a book update carry a zero return so that every series
  # covers the full 1..600 range.
  time.no.change <- setdiff(1:600, returns$time)
  if (length(time.no.change) > 0) {
    returns <- rbind(
      returns,
      data.frame(time = time.no.change, log_return = 0)
    )
    returns <- returns[order(returns$time), ]
  }
  log_rs[[i]] <- returns
}
# Per-time_id realized volatility tables; filled by the loop that follows.
vol <- list()

# Realized volatility of a vector of log returns: the square root of the
# sum of squared returns.
comp_vol <- function(x) {
  squared_returns <- x ^ 2
  sqrt(sum(squared_returns))
}
# Split each 600-second series into 30-second buckets (1..20) and compute
# the realized volatility of every bucket.
for (i in seq_along(log_rs)) {
  log_rs[[i]] <- log_rs[[i]] %>% mutate(time_bucket = ceiling(time / 30))
  vol[[i]] <- aggregate(
    log_return ~ time_bucket,
    data = log_rs[[i]],
    FUN = comp_vol
  )
  colnames(vol[[i]]) <- c("time_bucket", "volatility")
}
# Put into clusters based on the plots (***eyeballing***).
# Assign every time_id to a cluster using hand-picked thresholds:
#   4 - mean bid-ask spread above 0.15
#   3 - mean imbalance above 0.61
#   2 - mean imbalance below 0.45
#   1 - everything else
# NOTE(review): BidAskSpread is a relative spread (ask / bid - 1), so 0.15
# looks very high and cluster 4 may never be reached - confirm the threshold.
cluster_l <- numeric(length(vol))
for (i in seq_along(vol)) {
  if (BAS_mean[[i]] > 0.15) {
    # Assign to element i; the original replaced the whole vector with 4.
    cluster_l[[i]] <- 4
  } else if (imba_mean[[i]] > 0.61) {
    cluster_l[[i]] <- 3
  } else if (imba_mean[[i]] < 0.45) {
    cluster_l[[i]] <- 2
  } else {
    cluster_l[[i]] <- 1
  }
}
cluster_l
# [1] 3 3 2 1 3 1 2 1 2 2 1 1 1 3 2 1 3 2 2 2
model
# eGARCH(1, 1) with a constant mean (ARMA(0, 0)) and normal innovations.
spec <- ugarchspec(
  variance.model = list(model = "eGARCH", garchOrder = c(1, 1)),
  mean.model = list(armaOrder = c(0, 0)),
  distribution.model = "norm"
)

# Fit one model per time_id on the first 450 of 600 seconds (75% train).
ARMA_GARCH.models <- vector("list", length(vol))
for (i in seq_along(vol)) {
  train_returns <- log_rs[[i]] %>%
    filter(time <= 450) %>%
    pull(log_return)
  ARMA_GARCH.models[[i]] <- ugarchfit(
    spec = spec,
    data = train_returns,
    solver = "hybrid"
  )
}
# Simulate the 150 seconds after the training window from each fitted model:
# the first n_w = 30 seconds are used to fit blend weights, the remaining
# n_p = 120 seconds are the real prediction horizon.
n_w <- 30
n_p <- 120
pred1 <- vector("list", length(vol))
weight_train <- vector("list", length(vol))
for (i in seq_along(vol)) {
  # Freeze the estimated coefficients so ugarchpath simulates from the fit.
  fspec <- getspec(ARMA_GARCH.models[[i]])
  setfixed(fspec) <- as.list(coef(ARMA_GARCH.models[[i]]))

  # Rows are treated as simulated time steps, columns as independent paths.
  # NOTE(review): no seed is set, so results differ between runs.
  future.path <- fitted(ugarchpath(fspec, n.sim = n_w + n_p, m.sim = 1000))
  future.path[is.na(future.path)] <- 0

  # Root-mean-square of the simulated returns at each time step.
  step_rms <- unname(apply(future.path, 1, function(draws) {
    sqrt(mean(draws ^ 2))
  }))

  # NOTE(review): these are per-second magnitudes, while the target is a
  # 30-second realized volatility (sqrt of a sum of squares) - the scales
  # may differ by about sqrt(30); confirm before comparing the two.
  weight_train[[i]] <- tibble(fitted = step_rms[seq_len(n_w)])
  pred1[[i]] <- tibble(fitted = step_rms[n_w + seq_len(n_p)])
}
# EGARCH estimate for the weight-fitting bucket (bucket 16): the mean of the
# 30 per-second simulated values of each time_id.
# NOTE(review): this is a mean of per-second values, not a 30-second
# realized volatility - confirm the scale against vol.val.
garch_weight <- vapply(
  weight_train,
  function(w) mean(w$fitted),
  numeric(1)
)
# Collapse the 120 per-second EGARCH predictions into four 30-second buckets
# (buckets 17-20) by averaging within each bucket.
# The original windows were 61:91 and 91:120, so second 91 was counted twice
# and the third window held 31 values; the windows are now disjoint.
# NOTE(review): averaging per-second values gives a per-second scale, not a
# 30-second realized volatility - confirm this is intended.
pred1_adjust <- vector("list", length(vol))
for (i in seq_along(vol)) {
  per_second <- pred1[[i]]$fitted
  bucket_id <- ceiling(seq_along(per_second) / 30)
  pred1_adjust[[i]] <- tibble(
    fitted = as.numeric(tapply(per_second, bucket_id, mean))
  )
}
vol.train <- list()
# Split each realized-volatility table into the first 15 buckets (train)
# and the remaining buckets (validation).
vol.w <- list()
vol.train <- lapply(vol, function(v) v[1:15, ])
vol.val <- lapply(vol, function(v) v[-(1:15), ])
# Training design matrices for the weighted linear regression: the volatility
# of bucket k (k = 2..len.train) against the mean price, mean order size and
# mean bid-ask spread of bucket k - 1.
list.reg <- vector("list", length(vol))
stocklm <- stock %>%
  mutate(
    time_bucket = ceiling(seconds_in_bucket / 30),
    num_order = bid_size1 + ask_size1 + bid_size2 + ask_size2
  )
len.train <- length(vol.train[[1]]$volatility)
lagged_buckets <- seq_len(len.train - 1)
for (i in seq_along(vol)) {
  # time_bucket 0 only holds seconds_in_bucket == 0, which has no return.
  stats.bucket <- stocklm %>%
    filter(time_id == time_IDs[i] & time_bucket != 0) %>%
    select(BidAskSpread, WAP, num_order, time_bucket)
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket,
                          FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket,
                        FUN = mean)

  # aggregate() drops buckets with no book rows, so look rows up by bucket id
  # rather than by position; a missing bucket yields NA instead of silently
  # shifting every later feature by one bucket.
  list.reg[[i]] <- data.frame(
    volatility = vol.train[[i]]$volatility[-1],
    price = mean.price$WAP[match(lagged_buckets, mean.price$time_bucket)],
    order = mean.order$num_order[match(lagged_buckets,
                                       mean.order$time_bucket)],
    BidAskSpread = mean.BAS$BidAskSpread[match(lagged_buckets,
                                               mean.BAS$time_bucket)]
  )
}
# Weighted linear regression per time_id. The weights decay geometrically
# with age, so the most recent training bucket gets weight 1.
lm.models <- vector("list", length(vol))
decay_weights <- 0.8 ^ (((len.train - 2):0) / 2)
for (i in seq_along(vol)) {
  lm.models[[i]] <- lm(
    volatility ~ price + order + BidAskSpread,
    data = list.reg[[i]],
    weights = decay_weights
  )
}
list.reg.val <- list()
# Validation design matrices and WLR predictions: the volatility of the
# validation buckets is predicted from the features of the preceding bucket
# (buckets len.train .. len.train + len.val - 1).
len.val <- length(vol.val[[1]]$volatility)
list.reg.val <- vector("list", length(vol))
pred.lm <- vector("list", length(vol))
val_feature_buckets <- len.train:(len.train + len.val - 1)
for (i in seq_along(vol)) {
  stats.bucket <- stocklm %>%
    filter(time_id == time_IDs[i] & time_bucket != 0) %>%
    select(BidAskSpread, WAP, num_order, time_bucket)
  mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
  mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket,
                          FUN = mean)
  mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket,
                        FUN = mean)

  # Look features up by bucket id, not position: aggregate() omits buckets
  # without book rows, which would otherwise misalign the features.
  list.reg.val[[i]] <- data.frame(
    volatility = vol.val[[i]]$volatility,
    price = mean.price$WAP[match(val_feature_buckets,
                                 mean.price$time_bucket)],
    order = mean.order$num_order[match(val_feature_buckets,
                                       mean.order$time_bucket)],
    BidAskSpread = mean.BAS$BidAskSpread[match(val_feature_buckets,
                                               mean.BAS$time_bucket)]
  )
  pred.lm[[i]] <- predict(lm.models[[i]], newdata = list.reg.val[[i]])
}
# WLR predictions are model 2 in the blend. In the original these two
# statements were fused, which assigned an empty list to pred2.
pred2 <- pred.lm

# HAV training frames: the volatility of bucket k against the volatility of
# bucket k - 1 and the mean volatility of buckets k - 5 .. k - 1.
list.HAV <- vector("list", length(vol))
for (i in seq_along(vol)) {
  train_vol <- vol.train[[i]]$volatility

  # Rolling 5-bucket mean, accumulated one lag at a time.
  mean.vol <- rep(0, len.train - 5)
  for (j in 1:5) {
    mean.vol <- mean.vol + train_vol[j:(j + len.train - 6)] / 5
  }

  list.HAV[[i]] <- data.frame(
    vol = train_vol[-(1:5)],
    vol_1 = train_vol[5:(len.train - 1)],
    mean_vol_5 = mean.vol
  )
}
# Per-time_id quarticity tables; filled by the loop that follows.
quar <- list()

# Quarticity of a vector of log returns: (n / 3) times the sum of the
# fourth powers, where n is the number of returns.
comp_quar <- function(x) {
  n_obs <- length(x)
  fourth_power_sum <- sum(x ^ 4)
  n_obs / 3 * fourth_power_sum
}
# Quarticity of every 30-second bucket, per time_id. Relies on the
# time_bucket column already present in log_rs.
for (i in seq_along(log_rs)) {
  quar[[i]] <- aggregate(
    log_return ~ time_bucket,
    data = log_rs[[i]],
    FUN = comp_quar
  )
  colnames(quar[[i]]) <- c("time_bucket", "quarticity")
}
# HAV model per time_id, fitted by weighted least squares with weights
# vol_1 / sqrt(quarticity) taken from the lagged bucket.
# NOTE(review): a bucket with no price change has quarticity 0, which would
# make the weight Inf or NaN - confirm this cannot occur in the data.
HAV.wls.models <- vector("list", length(vol))
for (i in seq_along(vol)) {
  lagged_quarticity <- quar[[i]]$quarticity[5:(len.train - 1)]
  HAV.wls.models[[i]] <- lm(
    vol ~ vol_1 + mean_vol_5,
    data = list.HAV[[i]],
    weights = list.HAV[[i]]$vol_1 / sqrt(lagged_quarticity)
  )
}
pred.hav.all <- list()
# Recursive HAV forecast for buckets 16-20. Start from the last five training
# volatilities, predict one bucket ahead, then slide the window forward with
# the prediction appended. After five steps the window holds exactly the
# five predictions.
pred.hav <- list()
latest_obs <- vector("list", length(vol))
list_HAV1_cluster <- vector("list", length(vol))
for (i in seq_along(vol)) {
  latest_obs[[i]] <- tail(vol.train[[i]]$volatility, 5)
  for (step in 1:5) {
    # Regressors: the most recent volatility and the mean of the last five.
    list_HAV1_cluster[[i]] <- data.frame(
      vol_1 = latest_obs[[i]][5],
      mean_vol_5 = sum(latest_obs[[i]]) / 5
    )
    pred.hav[[step]] <- unname(
      predict(HAV.wls.models[[i]], newdata = list_HAV1_cluster[[i]])
    )
    # Drop the oldest observation and append the new prediction.
    latest_obs[[i]] <- c(latest_obs[[i]][-1], pred.hav[[step]])
  }
}
# Kept as a one-element list so that pred.hav.all[[1]][[i]] still works.
pred.hav.all <- list(latest_obs)
#pred.hav.all[[1]][[1]][[1]]
# HAV predictions are model 3 in the blend: one vector of five values
# (buckets 16-20) per time_id.
pred3 <- pred.hav.all[[1]]

# Let:
#   cluster 1, 4 = EGARCH + WLR
#   cluster 2    = WLR + HAV
#   cluster 3    = EGARCH + HAV
# Inputs to the blend, per time_id i:
#   garch_weight[[i]]         EGARCH, bucket 16
#   pred1_adjust[[i]]$fitted  EGARCH, buckets 17-20
#   pred2[[i]]                WLR,    buckets 16-20
#   pred3[[i]]                HAV,    buckets 16-20

# Grid-search the weight alpha in {0, 0.1, ..., 1} that minimises
# |alpha * pred_a + (1 - alpha) * pred_b - actual|. Ties go to the smallest
# alpha. Falls back to 0 when no error can be computed (all NA).
best_blend_alpha <- function(pred_a, pred_b, actual) {
  # (0:10) / 10 avoids the floating-point drift of repeatedly adding 0.1.
  alpha_grid <- (0:10) / 10
  abs_err <- abs(alpha_grid * pred_a + (1 - alpha_grid) * pred_b - actual)
  if (all(is.na(abs_err))) {
    return(0)
  }
  alpha_grid[which.min(abs_err)]
}

# Blend two models per time_id. The weight used for bucket k + 1 is the one
# that would have been best on bucket k ("weight from t-1"), k = 16..19.
mix <- vector("list", length(vol))
for (i in seq_along(vol)) {
  actual <- vol.val[[i]]$volatility

  # Full 16-20 series of each candidate model.
  egarch <- c(garch_weight[[i]], pred1_adjust[[i]]$fitted)
  wlr <- unname(pred2[[i]][1:5])
  hav <- pred3[[i]][1:5]

  # Model pair by cluster; clusters 1 and 4 share EGARCH + WLR.
  if (cluster_l[[i]] == 2) {
    model_a <- wlr
    model_b <- hav
  } else if (cluster_l[[i]] == 3) {
    model_a <- egarch
    model_b <- hav
  } else {
    model_a <- egarch
    model_b <- wlr
  }

  # Element k is fitted on bucket 15 + k and applied to bucket 16 + k.
  alpha_w <- vapply(
    1:4,
    function(k) best_blend_alpha(model_a[[k]], model_b[[k]], actual[[k]]),
    numeric(1)
  )
  beta_w <- round(1 - alpha_w, digits = 1)

  mod_a <- model_a[2:5]
  mod_b <- model_b[2:5]
  mix[[i]] <- data.frame(
    time = c(17, 18, 19, 20),
    pred_f = mod_a * alpha_w + mod_b * beta_w,
    mod_a = mod_a,
    mod_b = mod_b,
    alpha_w = alpha_w,
    beta_w = beta_w,
    val = actual[2:5]
  )
}
# Value table
mix
# [[1]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0010260675 0.0002041338 0.0010260675 0.0 1.0 0.0015946080
2 18 0.0010015944 0.0002025417 0.0010015944 0.0 1.0 0.0007010344
3 19 0.0006652719 0.0002032407 0.0009732926 0.4 0.6 0.0007203055
4 20 0.0007790745 0.0002006405 0.0010269748 0.3 0.7 0.0008264389
[[2]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0004069278 0.0001228562 0.0005286728 0.3 0.7 0.0006225340
2 18 0.0005574654 0.0001244393 0.0005574654 0.0 1.0 0.0003323998
3 19 0.0003381953 0.0001239375 0.0005524532 0.5 0.5 0.0001328152
4 20 0.0001222956 0.0001222956 0.0005043462 1.0 0.0 0.0001088957
[[3]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0011106308 0.0011106308 0.0006334167 1.0 0.0 0.0011320570
2 18 0.0009069263 0.0009069263 0.0006068875 1.0 0.0 0.0010788897
3 19 0.0010573743 0.0010573743 0.0006246518 1.0 0.0 0.0007937135
4 20 0.0007921832 0.0010463319 0.0006227508 0.4 0.6 0.0007345000
[[4]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006207086 4.093892e-05 0.0006207086 0.0 1.0 0.0004778847
2 18 0.0004724636 4.059793e-05 0.0005804300 0.2 0.8 0.0005119734
3 19 0.0005093462 4.050979e-05 0.0005614391 0.1 0.9 0.0007271090
4 20 0.0006149317 4.064493e-05 0.0006149317 0.0 1.0 0.0001894792
[[5]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0002619497 0.0001235271 0.0002965554 0.2 0.8 0.0001584394
2 18 0.0002028148 0.0001240457 0.0005178913 0.8 0.2 0.0007691548
3 19 0.0004737087 0.0001250849 0.0004737087 0.0 1.0 0.0006672344
4 20 0.0005147503 0.0001237248 0.0005147503 0.0 1.0 0.0008105287
[[6]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0012676115 0.0003257864 0.0018954948 0.4 0.6 0.0013235207
2 18 0.0008424771 0.0003245704 0.0011877482 0.4 0.6 0.0009859261
3 19 0.0010500945 0.0003237403 0.0012316830 0.2 0.8 0.0011174693
4 20 0.0006996395 0.0003202424 0.0007417947 0.1 0.9 0.0015402672
[[7]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.002108687 0.002108687 0.001259999 1.0 0.0 0.002893711
2 18 0.001994708 0.001994708 0.001082938 1.0 0.0 0.002303492
3 19 0.002135026 0.002135026 0.001335652 1.0 0.0 0.001550287
4 20 0.002004055 0.002338768 0.001860607 0.3 0.7 0.001924196
[[8]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0011555697 0.0002493622 0.001543944 0.3 0.7 0.0009875863
2 18 0.0009969878 0.0002489362 0.001495689 0.4 0.6 0.0012000349
3 19 0.0012040838 0.0002460939 0.001443581 0.2 0.8 0.0015288547
4 20 0.0013881996 0.0002477074 0.001388200 0.0 1.0 0.0013299081
[[9]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0005861145 0.0006079186 0.0005352381 0.7 0.3 0.0005099655
2 18 0.0006875757 0.0006925670 0.0006875757 0.0 1.0 0.0010484798
3 19 0.0007425936 0.0007425936 0.0005836357 1.0 0.0 0.0008481560
4 20 0.0006439829 0.0006439829 0.0006301278 1.0 0.0 0.0005676446
[[10]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006051743 0.0006691367 0.0006051743 0.0 1.0 0.0004338248
2 18 0.0006214556 0.0004130729 0.0006214556 0.0 1.0 0.0006972630
3 19 0.0006285788 0.0003937966 0.0006285788 0.0 1.0 0.0004923812
4 20 0.0007861862 0.0008848413 0.0006382035 0.6 0.4 0.0006493527
[[11]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0003253447 0.0001964894 0.0004542000 0.5 0.5 0.0012770825
2 18 0.0010498573 0.0001952542 0.0010498573 0.0 1.0 0.0013026599
3 19 0.0011927043 0.0001957932 0.0011927043 0.0 1.0 0.0008389981
4 20 0.0005502512 0.0001967262 0.0007859345 0.4 0.6 0.0002842181
[[12]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.002346960 0.0004987428 0.002346960 0.0 1.0 0.001265334
2 18 0.001040957 0.0004951248 0.001859706 0.6 0.4 0.002095780
3 19 0.002107611 0.0004944819 0.002107611 0.0 1.0 0.003082850
4 20 0.002415752 0.0005006603 0.002415752 0.0 1.0 0.001890465
[[13]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0009188474 0.0002020980 0.001226026 0.3 0.7 0.0013525503
2 18 0.0013726390 0.0002012416 0.001372639 0.0 1.0 0.0008768654
3 19 0.0008431131 0.0002021022 0.001270454 0.4 0.6 0.0012045310
4 20 0.0009882325 0.0002027464 0.001075509 0.1 0.9 0.0007419980
[[14]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0007927571 0.0003023523 0.0008472465 0.1 0.9 0.0007970520
2 18 0.0007908454 0.0003072274 0.0008445807 0.1 0.9 0.0001358754
3 19 0.0003051468 0.0003051468 0.0008426602 1.0 0.0 0.0008990069
4 20 0.0008387691 0.0003095749 0.0008387691 0.0 1.0 0.0008202798
[[15]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.001714043 0.001357333 0.001714043 0 1 0.001831960
2 18 0.001386287 0.001560010 0.001386287 0 1 0.001643141
3 19 0.001521939 0.001521939 0.001368533 1 0 0.001586889
4 20 0.001767904 0.001767904 0.001527690 1 0 0.001668429
[[16]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0005572954 0.0002777904 0.0006770833 0.3 0.7 0.0013167039
2 18 0.0008307884 0.0002820644 0.0008307884 0.0 1.0 0.0009321254
3 19 0.0009860273 0.0002809864 0.0009860273 0.0 1.0 0.0008096176
4 20 0.0004877660 0.0002794405 0.0005770483 0.3 0.7 0.0010109599
[[17]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0005496654 7.610765e-05 0.0008653706 0.4 0.6 0.0016703868
2 18 0.0018215213 7.567910e-05 0.0018215213 0.0 1.0 0.0011431227
3 19 0.0005635098 7.663819e-05 0.0008880908 0.4 0.6 0.0009476834
4 20 0.0018334588 7.598328e-05 0.0018334588 0.0 1.0 0.0021947340
[[18]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.002497587 0.001782293 0.002497587 0.0 1.0 0.003299317
2 18 0.002621825 0.001912485 0.002621825 0.0 1.0 0.001602806
3 19 0.001338368 0.001338368 0.002798108 1.0 0.0 0.002310615
4 20 0.002407931 0.001655130 0.002730561 0.3 0.7 0.001912041
[[19]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0007837318 0.0007837318 0.0009926786 1.0 0.0 0.0007625953
2 18 0.0007486862 0.0007486862 0.0006048890 1.0 0.0 0.0006761409
3 19 0.0008523670 0.0014565198 0.0002482143 0.5 0.5 0.0008144871
4 20 0.0003353993 0.0004255822 0.0002452164 0.5 0.5 0.0009530073
[[20]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0013848057 0.0013848057 0.0009930291 1 0 0.0009485897
2 18 0.0008194832 0.0008127469 0.0008194832 0 1 0.0003690774
3 19 0.0014807308 0.0014807308 0.0008166304 1 0 0.0006364187
4 20 0.0008099396 0.0014257837 0.0008099396 0 1 0.0009045089
plot
# One line chart per time_id comparing the realized volatility with both
# component models and the blended prediction over buckets 17-20.
all_plot <- vector("list", length(vol))
for (i in seq_along(vol)) {
  blend <- mix[[i]]

  # " 17 = ( a : b ) 18 = ( a : b ) ..." - the weights used per bucket.
  weight_p <- paste0(
    " ",
    paste(
      blend$time, "=", "(", blend$alpha_w, ":", blend$beta_w, ")",
      collapse = " "
    )
  )

  # Model pair used for this cluster; clusters 1 and 4 share a pair.
  model_label <- switch(
    as.character(cluster_l[[i]]),
    "2" = "WLR + HAV",
    "3" = "EGARCH + HAV",
    "EGARCH + WLR"
  )

  all_plot[[i]] <- ggplot(blend, aes(x = time)) +
    geom_line(aes(y = val, color = "Real Volatility")) +
    geom_line(aes(y = mod_a, color = "Model a"), linetype = "twodash") +
    geom_line(aes(y = mod_b, color = "Model b"), linetype = "twodash") +
    geom_line(aes(y = pred_f, color = "Mix Model"), linetype = "twodash") +
    scale_color_manual(
      name = "Model",
      values = c(
        "Real Volatility" = "red",
        "Model a" = "lightblue",
        "Model b" = "green",
        "Mix Model" = "blue"
      )
    ) +
    theme_classic() +
    labs(
      title = "Prediction Result",
      tag = as.character(i),
      subtitle = paste(
        "cluster ", as.character(cluster_l[[i]]),
        model_label,
        "\n weight for each time interval : \n",
        weight_p
      ),
      x = "Time interval",
      y = "Volatility",
      caption = "each time interval = 30 seconds"
    ) #+
  #transition_reveal(time)
}
all_plot
# [[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
[[10]]
[[11]]
[[12]]
[[13]]
[[14]]
[[15]]
[[16]]
[[17]]
[[18]]
[[19]]
[[20]]
# Animate the fifth plot, revealing the lines progressively along `time`.
all_plot[[5]] +
  transition_reveal(time)